#------------------------------------------------------------------------------
# Combine all individual-level weekly data into a single table.
# The combined data is very large (~105 GB in memory), so it cannot be shared.
#==============================================================================

load_library = c('bit64','data.table','fst','future.apply','stringr','logger','vroom')
invisible(lapply(load_library, function(x) library(x, character.only=TRUE, quietly= TRUE)))

# root directory under which every input and output path below is built
bucket <- "/N/project/iuni_doctorshopping/"

# Pull the first `YYYY-MM-DD` token out of a file name.
#   outfile: character vector of file names / paths
#   returns: character vector of the matched tokens
extract_date_from_filename <- function(outfile) {
  str_extract(outfile, "\\d{4}-\\d{2}-\\d{2}")
}

# Pull the first `YYYYWww` token (four digits, "W", two digits) out of a
# file name.
#   outfile: character vector of file names / paths
#   returns: character vector of the matched tokens
extract_week_from_filename <- function(outfile) {
  # A match consists only of digits and "W", so it can never contain a space;
  # the former space-to-underscore substitution was a no-op and is dropped.
  str_extract(outfile, "\\d{4}W\\d{2}")
}

# ----------
# ses data: only the patient id and race columns are loaded
ses_data <- read_fst(
  path = file.path(bucket, 'derived_v4_202101', 'mbr_ses', 'ses_ses.fst'),
  columns = c('PATID', 'race'),
  as.data.table = TRUE
)

# full paths of all weekly data files
all_files <- list.files(
  file.path(bucket, 'projects', 'covid_opioid', 'weekly_ses'),
  full.names = TRUE
)

# Read every weekly file, recode its columns, and stack the results into one
# person-week table.
ind_combined <- rbindlist(lapply(all_files, function(ff) {

  message('now reading ... ', ff)

  # Year and week come from the `YYYYWww` token in the file name. They are
  # parsed up front because the age calculation below needs the year (the
  # previous code called an undefined `extract_year_from_filename()` here).
  week_token <- extract_week_from_filename(ff)
  if (is.na(week_token)) {
    warning('no YYYYWww token found in file name: ', ff, call. = FALSE)
  }
  token_parts <- strsplit(week_token, 'W')[[1]]
  file_year <- as.integer(token_parts[1])
  file_week <- as.integer(token_parts[2])

  dt <- read_fst(ff, columns = c('PATID', 'female', 'YRDOB', 'LIS_DUAL', 'PRODUCT',
    'pain', 'backpain', 'neckpain', 'limbpain',
    'opioids', 'therapy', 'sum_opioid_days', 'sum_opioid_mme'), as.data.table = TRUE)

  # recode medicare type; rows with LIS_DUAL == 'U' are left as NA
  dt[LIS_DUAL != 'U', medicare := factor(LIS_DUAL, levels = c('', 'D', 'L', 'O'),
    labels = c('private', 'dual', 'lis', 'other'))]
  dt[, LIS_DUAL := NULL]
  dt[, PRODUCT := as.factor(PRODUCT)]
  dt[, female := as.integer(female)]

  # calculate age based on birth year; rows with YRDOB <= 0 keep age = NA
  dt[YRDOB > 0, age := as.integer(file_year - YRDOB + 1)]
  # NOTE(review): ages of 92 and above are set to 90 while 90 and 91 are left
  # untouched -- confirm the threshold is intended.
  dt[age >= 92, age := 90L]
  dt[, YRDOB := NULL]

  # include week information (added by reference, no copy of dt)
  # NOTE(review): weeknum assumes 52 weeks per year, so a week 53 would get the
  # same index as week 1 of the following year -- confirm this cannot occur.
  dt[, `:=`(
    year = file_year,
    week = file_week,
    weeknum = file_year * 52L + file_week
  )]

  dt
}))

# keep only the weeks that are observed in 2020, so that 2019 and 2020 cover
# the same set of weeks
target_week <- sort(ind_combined[year == 2020, unique(week)])
ind_combined <- ind_combined[week %in% target_week]

# three main periods: weeks 1-10 -> 1, weeks 11-27 -> 2, weeks 28-40 -> 3;
# any other week keeps period = NA
ind_combined[week %in% 1:40, period := findInterval(week, c(1L, 11L, 28L))]

# n_week is a constant 1 on every row; targetpain starts at 0 and is set to 1
# for the target sample, i.e. rows with any back, neck or limb pain
ind_combined[, `:=`(n_week = 1L, targetpain = 0L)]
ind_combined[backpain > 0 | neckpain > 0 | limbpain > 0, targetpain := 1L]

# drop person-weeks with missing age or sex
# The NA mask is computed once and reused for the count, the per-patient
# summary and the final filter, instead of rescanning the table three times.
missing_age_or_sex <- ind_combined[, is.na(age) | is.na(female)]

# The message now says what is actually counted (rows with missing age or sex,
# not the total number of person-weeks).
message('number of person-weeks with missing age or sex is ... ',
        sum(missing_age_or_sex)) # 102751 person-week

message('how many patients have missing in age and sex ... ')
print(ind_combined[missing_age_or_sex, .N, by = 'PATID']) # 3395 patients

message('drop them ...')
ind_combined <- ind_combined[!missing_age_or_sex]

# the mask has one element per row of the original table; free it right away
rm(missing_age_or_sex)

# write the combined person-week table to disk at compression level 100
#object.size(ind_combined) %>% format('Gb') # 105.3GB
write_fst(
  x = ind_combined,
  path = file.path(bucket, 'projects', 'covid_opioid', 'data',
    'processed_data', 'weekly_ses_individuals.fst'),
  compress = 100
)


